* Erin Huffer February 2021
* Process David Leip Turnout data to get community voting rates

cap log close
clear all
set more off, perm

global temp "$raw/Voter/dave_leip/temp"  

/* IPEDS data set */ 

clear 
import excel "$raw\Voter\dave_leip\IPEDS\IPEDS.xlsx", sheet("Sheet1") firstrow
gen ID_IPEDS = ipeds_id
gen ipeds_city_state = ipeds_city + " " + ipeds_stateab 
gen fips = ipeds_countyfips

* Add code to do town-level Dave Leip instead of county-level
gen town_name = ipeds_city
gen statefips = ipeds_statefips
save "$temp\ipeds.dta", replace

/* Import Dave Leip data */

clear

foreach year in 1992 1996 2000 2004 2008 2012 2016 {
	clear
	import excel "$raw/Voter/dave_leip/`year'.xlsx", sheet("State VTO") firstrow case(lower)

	gen year4 = `year'
	
	*Generates relevant ratios
	gen turnout_total = turnout/totalpop
	gen reg_total = totalreg/totalpop
	gen turnout_vap = turnout/totalvap
	gen turnout_reg = turnout/totalreg
	gen reg_vap = totalreg/totalvap 

	save "$raw/Voter/dave_leip/`year'_state.dta", replace
}


* Read in Dave Leip counties and merge with IPEDS

foreach year in 1992 1996 2000 2004 2008 2012 2016 {
	clear
	import excel "$raw/Voter/dave_leip/`year'.xlsx", sheet("County VTO") firstrow case(lower)

	gen year4 = `year'
	destring fips, replace
	destring statefips, replace
	
	*Generates relevant ratios
	gen turnout_total = turnout/totalpop
	gen reg_total = totalreg/totalpop
	gen turnout_vap = turnout/totalvap
	gen turnout_reg = turnout/totalreg
	gen reg_vap = totalreg/totalvap
	
	drop if state==""
	rename state name

	save "$raw/Voter/dave_leip/`year'_county.dta", replace
	
	*Merge Dave Leip county data to IPEDS counties
	merge m:m fips using "$temp\ipeds.dta"
	keep if _merge==3 
	drop _merge 
	drop if fips==.
	save "$temp/`year'_ipeds_county.dta", replace
}


* Read in Dave Leip towns and merge with IPEDS

foreach year in 1992 1996 2000 2004 2008 2012 2016 {
	clear
	import excel "$raw/Voter/dave_leip/`year'.xlsx", sheet("Town VTO") firstrow case(lower)

	gen year4 = `year'
	destring statefips, replace
	destring fips, replace
	
	*Generates relevant ratios
	gen turnout_total = turnout/totalpop
	gen turnout_reg = turnout/totalreg
	gen reg_total = totalreg/totalpop

	save "$raw/Voter/dave_leip/`year'_town.dta", replace
	
	rename town town_name
	format town_name town_name %20s
	drop if town_name==""
	destring fips, replace
	
	*Merge Dave Leip town data to IPEDS towns 
	merge m:m town_name statefips using "$temp/ipeds.dta"
	keep if _merge==3 
	drop _merge 
	drop if fips==.
	save "$temp/`year'_ipeds_town.dta", replace
}

* Create master state data file
clear
use "$raw/Voter/dave_leip/1992_state.dta"
foreach year in 1996 2000 2004 2008 2012 2016 {
	append using "$raw/Voter/dave_leip/`year'_state.dta", force
}
drop if state==""
rename state name
save "$raw/Voter/dave_leip/state.dta", replace

* Create master county data file without IPEDS
clear
use "$raw/Voter/dave_leip/1992_county.dta"
foreach year in 1996 2000 2004 2008 2012 2016 {
	append using "$raw/Voter/dave_leip/`year'_county.dta", force
}
save "$temp/county.dta", replace

* Create master county data file with IPEDS
clear
use "$temp/1992_ipeds_county.dta"
foreach year in 1996 2000 2004 2008 2012 2016 {
	append using "$temp/`year'_ipeds_county.dta", force
}
save "$temp/ipeds_county.dta", replace

* Create master town data file without IPEDS
clear
use "$raw/Voter/dave_leip/1992_town.dta"
foreach year in 1996 2000 2004 2008 2012 2016 {
	append using "$raw/Voter/dave_leip/`year'_town.dta", force
}
rename town town_name
save "$temp/town.dta", replace

* Create master town data file with IPEDS
clear
use "$temp/1992_ipeds_town.dta"
foreach year in 1996 2000 2004 2008 2012 2016 {
	append using "$temp/`year'_ipeds_town.dta", force
}

save "$temp/ipeds_town.dta", replace

clear

use "$temp/ipeds_town.dta"



/* Add information (including IPEDS IDs) to the universities represented in the baseanalysisfile_voter_lotto data file so that we can merge with Dave Leip */

clear
use "$data/baseanalysisfile_voter_lotto.dta", clear

* Closest election PRIOR to grad year
gen year_elec = .
replace year_elec = 2004 if proj_year12<=2008
replace year_elec = 2008 if proj_year12>2008 & proj_year12<=2012
replace year_elec = 2012 if proj_year12>2012 & proj_year12<=2016
replace year_elec = 2016 if proj_year12>2016 


rename ID_FSC_firstinst ID_FSC
rename college_name_firstinst college_name
keep sasid ID_FSC year_elec college_name
gen year4 = year_elec

g sub = substr(ID_FSC,1,6) +"-00"
qui replace ID_FSC = sub
drop sub 
replace ID_FSC = "" if ID_FSC=="-00"
merge m:1 ID_FSC using "$data_clean/FSC.dta"   /*, assert(match using) keep(match) nogen */
drop if _merge==2
drop _merge
merge m:1 ID_IPEDS using "$data_clean/IPEDS.dta" /*, assert(match using) keep(match) nogen */
drop if _merge==2
drop _merge

save "$temp\ipeds_fsc_sasid.dta", replace

clear 
use "$temp\ipeds_fsc_sasid.dta"

merge m:1 ID_IPEDS using "$temp\ipeds.dta"
drop if _merge==2
drop _merge

keep sasid ID_FSC ID_IPEDS college_name ipeds_ow year_elec year4 ipeds_statefips ipeds_stateab
save "$temp\sas_ipeds_fsc_idsonly.dta", replace 


/* Merge with the Dave Leip COUNTY data */

clear
use "$temp\sas_ipeds_fsc_idsonly.dta"
merge m:1 ID_IPEDS year4 using "$temp\ipeds_county.dta"

/*
Most of the students appear in schools that we have coverage for!

    Result                           # of obs.
    -----------------------------------------
    not matched                        47,561
        from master                     6,566  (_merge==1)
        from using                     40,995  (_merge==2)

    matched                            13,190  (_merge==3)
    -----------------------------------------

Here, we care about the _merge==1 students (in our dataset, but not matched to a county in the Dave Leip data).
*/

count if _merge==1 & ID_FSC==""
* 6,255 unmatched students didn't go to college; we'll put them down for Boston.

count if _merge==1 & ID_FSC!=""
* 311 unmatched students report going to college. We need to figure out who they are.
  
tab ipeds_owner if _merge==1 & ID_FSC!=""
/*

 college ownership |      Freq.     Percent        Cum.
-------------------+-----------------------------------
private for-profit |         99       31.94       31.94
 private non-profi |        187       60.32       92.26
            public |         24        7.74      100.00
-------------------+-----------------------------------
             Total |        310      100.00

*/
tab college_name if _merge==1 & ID_FSC!=""
* 36 schools

/*
                           college_name |      Freq.     Percent        Cum.
----------------------------------------+-----------------------------------
             ARMSTRONG STATE UNIVERSITY |          2        0.64        0.64
                 DANIEL WEBSTER COLLEGE |          6        1.93        2.57
DELAWARE TECHNICAL AND CC -STANTON/WI.. |          1        0.32        2.89
               EVEREST COLLEGE - AURORA |          1        0.32        3.22
              EVEREST COLLEGE - CHELSEA |          7        2.25        5.47
          EVEREST INSTITUTE - BISSONNET |          1        0.32        5.79
           EVEREST INSTITUTE - BRIGHTON |         29        9.32       15.11
        EVEREST INSTITUTE - NORTH MIAMI |          1        0.32       15.43
             EVEREST UNIVERSITY - TEMPE |          1        0.32       15.76
          FLINT RIVER TECHNICAL COLLEGE |          1        0.32       16.08
            HESSER COLLEGE - MANCHESTER |          1        0.32       16.40
                      HUSSON UNIVERSITY |         10        3.22       19.61
                ITT TECHNICAL INSTITUTE |         15        4.82       24.44
              KENNESAW STATE UNIVERSITY |          2        0.64       25.08
LE CORDON BLEU COLLEGE OF CULINARY AR.. |          1        0.32       25.40
LE CORDON BLEU COLLEGE OF CULINARY AR.. |          3        0.96       26.37
         LINCOLN COLLEGE OF NEW ENGLAND |          2        0.64       27.01
                   LYNDON STATE COLLEGE |         14        4.50       31.51
                      MOUNT IDA COLLEGE |         78       25.08       56.59
               MOUNT WASHINGTON COLLEGE |          3        0.96       57.56
      MOUNT WASHINGTON COLLEGE - NASHUA |          1        0.32       57.88
NEW ENGLAND INSTITUTE OF ART AND COMM.. |          6        1.93       59.81
                PHILADELPHIA UNIVERSITY |          4        1.29       61.09
         SANFORD BROWN COLLEGE - BOSTON |          2        0.64       61.74
  SANTA FE UNIVERSITY OF ART AND DESIGN |          1        0.32       62.06
      SCHOOL OF THE MUSEUM OF FINE ARTS |          1        0.32       62.38
  SOUTHERN POLYTECHNIC STATE UNIVERSITY |          1        0.32       62.70
                   UNIVERSIDAD DEL ESTE |          1        0.32       63.02
              UNIVERSIDAD METROPOLITANA |          1        0.32       63.34
       UNIVERSITY OF ALASKA - SOUTHEAST |          1        0.32       63.67
                     UNIVERSITY OF IOWA |          1        0.32       63.99
                  UNIVERSITY OF PHOENIX |         18        5.79       69.77
      UNIVERSITY OF PUERTO RICO-BAYAMON |          1        0.32       70.10
      UNIVERSITY OF PUERTO RICO-HUMACAO |          1        0.32       70.42
              VIRGINIA COLLEGE - BILOXI |          1        0.32       70.74
                       WHEELOCK COLLEGE |         91       29.26      100.00
----------------------------------------+-----------------------------------
                                  Total |        311      100.00
*/

* One college with an FSC ID but no IPEDS ID (Everest College - Aurora)

save "$temp\sas_fsc_ipeds_county_merge.dta", replace 

clear
use "$temp\sas_fsc_ipeds_county_merge.dta"

* Save the observations for whom the merge worked separately.
keep if _merge==3
drop _merge
save "$temp\sas_fsc_ipeds_county_merge3.dta", replace

* Save the unmerged observations separately (_merge==1)
clear
use "$temp\sas_fsc_ipeds_county_merge.dta"
keep if _merge==1
drop _merge

save "$temp\sas_fsc_ipeds_county_merge1.dta", replace

* Pulled 311 students into a data file, identified the ZIP of their chosen college, and merged with a ZIP/county FIPS crosswalk to get county data. SASID + FIPS county code is in this Excel file.
* NOTE: Students at ITT Technical Institute, University of Phoenix, and all Everest institutions are identified as being located in Boston (likely taking courses online).

clear 
import excel "$temp\universities_no_ipeds_slim_county.xlsx", sheet("Sheet1") firstrow
format %12.0g sasid

merge 1:1 sasid using "$temp\sas_fsc_ipeds_county_merge1.dta"

* Set students who didn't go to college as being in Boston (FIPS==25025)
replace fips=25025 if ID_FSC==""
keep sasid fips ID_FSC college_name year_elec year4 ID_IPEDS ipeds_ownership

merge m:m fips year4 using "$temp\county.dta"
keep if _merge!=2
keep sasid ID_FSC college_name year_elec year4 ID_IPEDS ipeds_ownership name stateab totalpop totalvap totalreg ballots turnout democrat republican independent statefips countyfips fips lsad_trans turnout_total turnout_vap turnout_reg reg_vap totalvap_census reg_total activereg democratic 

append using "$temp\sas_fsc_ipeds_county_merge3.dta"
*19,756 observations--matches the base analysis file! 

save "$data\sas_fsc_ipeds_county_full.dta", replace

clear
use "$data\sas_fsc_ipeds_county_full.dta"
keep sasid year_elec ID_FSC ID_IPEDS ipeds_ownership college_name turnout_total turnout_vap turnout_reg reg_vap totalvap_census reg_total

save "$data\sas_fsc_ipeds_county_slim.dta", replace

/*
clear
use "$temp\sas_fsc_ipeds_county_slim.dta"
merge 1:1 sasid using "${data}/baseanalysisfile_voter_lotto.dta"
gen lnturnout = ln(turnout_reg)
drop _merge
save "$temp\baseanalysisfile_voter_lotto_EMHcounty.dta", replace
save "$data/baseanalysisfile_voter_lotto_EMHcounty.dta", replace

* 5 obs with missing turnout/total pop 
	* They're in school in PR, Venezuela? and one dude up in Alaska
* 111 obs with missing turnout/registered voters, and that's due to missing Dave Leip data.
*/


*************************

/* DAVE LEIP TOWN DATA */ 

*************************

clear
use "$temp\sas_ipeds_fsc_idsonly.dta"
merge m:m ID_IPEDS year4 using "$temp\ipeds_town.dta"

/*
Most of the students appear in schools that we have coverage for!

     Result                           # of obs.
    -----------------------------------------
    not matched                        11,747
        from master                     9,929  (_merge==1)
        from using                      1,818  (_merge==2)

    matched                             9,828  (_merge==3)
    -----------------------------------------

Here, we care about the _merge==1 students (in our dataset, but not matched to a county in the Dave Leip data).
*/

*m:m merge created one duplicate SASID in Rutland, VT--we drop the Town copy in favor of the City copy
drop if sasid==1093958718 & subdivision=="Town"

count if _merge==1 & ID_FSC==""
* 6,255 unmatched students didn't go to college; we'll put them down for Boston. (Happy that this matches the county data from before!)

count if _merge==1 & ID_FSC!=""
tab ipeds_stateab if _merge==1 & ID_FSC!=""
* 3,674 unmatched students report going to college but aren't represented in our dataset. We want to see whether they're in the Northeast or not.

/*
ipeds_state |
         ab |      Freq.     Percent        Cum.
------------+-----------------------------------
         AK |          1        0.03        0.03
         AL |         28        0.83        0.86
         AR |          2        0.06        0.92
         AZ |         22        0.65        1.57
         CA |         94        2.79        4.36
         CO |         24        0.71        5.08
         CT |         19        0.56        5.64
         DC |         84        2.49        8.13
         DE |         15        0.45        8.58
         FL |        154        4.57       13.15
         GA |         77        2.29       15.43
         HI |          2        0.06       15.49
         IA |         13        0.39       15.88
         ID |          9        0.27       16.15
         IL |         45        1.34       17.48
         IN |          8        0.24       17.72
         KS |          1        0.03       17.75
         KY |         24        0.71       18.46
         LA |         26        0.77       19.23
         MA |      1,483       44.02       63.25
         MD |         67        1.99       65.24
         MI |         22        0.65       65.89
         MN |         15        0.45       66.34
         MO |          8        0.24       66.58
         MS |          3        0.09       66.67
         MT |          6        0.18       66.84
         NC |         97        2.88       69.72
         ND |          2        0.06       69.78
         NJ |         37        1.10       70.88
         NM |          4        0.12       71.00
         NV |          4        0.12       71.12
         NY |        467       13.86       84.98
         OH |         81        2.40       87.38
         OK |          8        0.24       87.62
         OR |          6        0.18       87.80
         PA |        140        4.16       91.96
         PR |          5        0.15       92.10
         RI |         35        1.04       93.14
         SC |         33        0.98       94.12
         SD |          2        0.06       94.18
         TN |         15        0.45       94.63
         TX |         33        0.98       95.61
         UT |         13        0.39       95.99
         VA |         92        2.73       98.72
         WA |         16        0.47       99.20
         WI |          9        0.27       99.47
         WV |         17        0.50       99.97
         WY |          1        0.03      100.00
------------+-----------------------------------
      Total |      3,369      100.00

*/
tab college_name if _merge==1 & ID_FSC!="" & (ipeds_stateab=="CT"|ipeds_stateab=="MA"|ipeds_stateab=="RI")

/*
                           college_name |      Freq.     Percent        Cum.
----------------------------------------+-----------------------------------
                         BOSTON COLLEGE |        112        7.29        7.29
             CAPE COD COMMUNITY COLLEGE |        234       15.22       22.51
   EASTERN CONNECTICUT STATE UNIVERSITY |          2        0.13       22.64
                   FAIRFIELD UNIVERSITY |          1        0.07       22.71
    MASSACHUSETTS BAY COMMUNITY COLLEGE |        269       17.50       40.21
         MASSACHUSETTS MARITIME ACADEMY |         30        1.95       42.16
                      MERRIMACK COLLEGE |          1        0.07       42.23
         NORTHWESTERN COMMUNITY COLLEGE |          1        0.07       42.29
                     PINE MANOR COLLEGE |         18        1.17       43.46
     QUINEBAUG VALLEY COMMUNITY COLLEGE |          1        0.07       43.53
              ROXBURY COMMUNITY COLLEGE |        353       22.97       66.49
       SOUTHEASTERN TECHNICAL INSTITUTE |          1        0.07       66.56
              UNIVERSITY OF CONNECTICUT |         15        0.98       67.53
     UNIVERSITY OF MASSACHUSETTS BOSTON |          1        0.07       67.60
     UNIVERSITY OF MASSACHUSETTS LOWELL |          1        0.07       67.66
  UNIVERSITY OF MASSACHUSETTS-DARTMOUTH |        462       30.06       97.72
             UNIVERSITY OF RHODE ISLAND |         35        2.28      100.00
----------------------------------------+-----------------------------------
                                  Total |      1,537      100.00

*/


* Pull unmatched students into a data file and add the correct town name that matches Dave Leip.
* The following colleges get assigned to surrounding county because their town does not appear in the Dave Leip data.
*	NORTHWESTERN COMMUNITY COLLEGE
*	PINE MANOR COLLEGE 
* 	QUINEBAUG VALLEY COMMUNITY COLLEGE
* 	UNIVERSITY OF CONNECTICUT
* 	UNIVERSITY OF RHODE ISLAND
* 	UNIVERSITY OF MASSACHUSETTS-DARTMOUTH

save "$temp\sas_fsc_ipeds_town_merge.dta", replace 

clear
use "$temp\sas_fsc_ipeds_town_merge.dta"

* Save the observations for whom the merge worked separately.
keep if _merge==3
drop _merge
save "$temp\sas_fsc_ipeds_town_merge3.dta", replace

* Save students who didn't attend college separately 
clear
use "$temp\sas_fsc_ipeds_town_merge.dta"
keep if _merge==1 & ID_FSC==""
drop _merge
replace fips=25025
replace stateab="MA"
replace town_name="Boston"
save "$temp\sas_fsc_ipeds_town_nocollege.dta", replace

* Save students at ITT Technical Institute, University of Phoenix, and all Everest institutions, who are identified as being located in Boston (likely taking courses online), separately.
clear
use "$temp\sas_fsc_ipeds_town_merge.dta"
keep if strpos(college_name, "ITT T")|strpos(college_name, "EVEREST")|strpos(college_name, "PHOENIX")
drop _merge
replace fips=25025
replace ipeds_stateab="MA"
replace stateab="MA"
replace town_name="Boston"
save "$temp\sas_fsc_ipeds_town_online.dta", replace

* Save unmatched students who went to school in the Northeast separately 
clear
use "$temp\sas_fsc_ipeds_town_merge.dta"
keep if _merge==1 & ID_FSC!="" & (ipeds_stateab=="CT"|ipeds_stateab=="MA"|ipeds_stateab=="RI")
drop _merge
save "$temp\sas_fsc_ipeds_town_tomatch.dta", replace
keep sasid college_name ipeds_city ipeds_stateab ipeds_zip year4
save "$temp\sas_fsc_ipeds_town_tomatchslim.dta", replace

sort college_name
drop ipeds_city ipeds_zip
export excel using "$temp\sas_fsc_ipeds_town_tomatch.xls", sheetreplace

* Manually added state and town names by hand in this Excel file 
import excel "$temp\sas_fsc_ipeds_town_tomatch_withtown.xls", sheet("Sheet1") firstrow clear
format %12.0g sasid

merge 1:1 sasid using "$temp\sas_fsc_ipeds_town_tomatch.dta"
drop _merge

* Append our online and non-college students (who are all in Boston)
append using "$temp\sas_fsc_ipeds_town_online.dta"
append using "$temp\sas_fsc_ipeds_town_nocollege.dta"
keep sasid college_name year4 ipeds_stateab zip town_name stateab ID_FSC year_elec ID_IPEDS ipeds_ownership ipeds_statefips

merge m:m town_name stateab year4 using "$temp\town.dta"
keep if _merge==3

* Add our initial successful merges in
append using "$temp\sas_fsc_ipeds_town_merge3.dta"
* 17,691 observations--we only lose about 2,000 people from our file!

keep sasid ID_FSC college_name year_elec year4 ID_IPEDS ipeds_ownership town_name stateab totalpop totalvap totalreg ballots turnout democrat republican independent statefips countyfips fips turnout_total turnout_reg reg_total activereg 

save "$data\sas_fsc_ipeds_town_full.dta", replace

keep sasid year_elec ID_FSC ID_IPEDS ipeds_ownership college_name turnout_total turnout_reg reg_total
save "$data\sas_fsc_ipeds_town_slim.dta", replace


***********************************************************
* STRIP AND MERGE TOWN-LEVEL & COUNTY-LEVEL TO LOTTO FILE *
***********************************************************

clear
use "$data\sas_fsc_ipeds_town_slim.dta"

keep sasid turnout_reg 
rename turnout_reg turnout_reg_town
duplicates drop sasid, force

merge 1:1 sasid using "$temp\sas_fsc_ipeds_county_slim.dta", nogen
rename turnout_reg turnout_reg_county
rename turnout_total turnout_total_county 
rename reg_total reg_total_county 
rename turnout_vap turnout_vap_county 
rename reg_vap reg_vap_county 
rename totalvap_census totalvap_census_county

* Set voting rate to county-level if missing for town-level (non-NE students)
gen turnout_reg = turnout_reg_town
replace turnout_reg = turnout_reg_county if turnout_reg_town==.

save "$data\dave_leip.dta", replace

